In this document we explore the METABRIC dataset.
print(paste('Number of samples: ', length(unique(cells$ImageNumber))))
## [1] "Number of samples: 794"
print(paste('Number of cell types: ', length(unique(cells$meta_description))))
## [1] "Number of cell types: 32"
paste('Cell types:')
## [1] "Cell types:"
print(unique(cells$meta_description))
## [1] "CK^{med}ER^{lo}" "ER^{hi}CXCL12^{+}"
## [3] "CD4^{+} T cells & APCs" "CD4^{+} T cells"
## [5] "Endothelial" "Fibroblasts"
## [7] "Myofibroblasts PDPN^{+}" "CD8^{+} T cells"
## [9] "CK8-18^{hi}CXCL12^{hi}" "Myofibroblasts"
## [11] "CK^{lo}ER^{lo}" "Macrophages"
## [13] "CK^{+} CXCL12^{+}" "CK8-18^{hi}ER^{lo}"
## [15] "CK8-18^{+} ER^{hi}" "CD15^{+}"
## [17] "MHC I & II^{hi}" "T_{Reg} & T_{Ex}"
## [19] "CD57^{+}" "Ep Ki67^{+}"
## [21] "CK^{lo}ER^{med}" "Macrophages & granulocytes"
## [23] "CD38^{+} lymphocytes" "Ki67^{+}"
## [25] "HER2^{+}" "B cells"
## [27] "Basal" "Fibroblasts FSP1^{+}"
## [29] "Granulocytes" "MHC I^{hi}CD57^{+}"
## [31] "Ep CD57^{+}" "MHC^{hi}CD15^{+}"
ggplot(data= cells %>% count(ImageNumber)) + geom_bar(aes(y=n, x=ImageNumber), stat='identity') + theme_bw() + ggtitle('Total counts per sample')
if (params$save_files){
ggsave(paste(here('output/Data_exploration/'),'totalcounts.pdf', sep=''))
}
## Saving 7 x 5 in image
ggplot(data=cells %>% group_by(ImageNumber) %>% count(meta_description)) + geom_boxplot(aes(x=meta_description, y=n)) + theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle('Cell type counts per sample') + ylim(0,100)
## Warning: Removed 2783 rows containing non-finite values (`stat_boxplot()`).
if (params$save_files){
ggsave(paste(here('output/Data_exploration/'),'samplecounts.pdf', sep=''))
}
## Saving 7 x 5 in image
## Warning: Removed 2783 rows containing non-finite values (`stat_boxplot()`).
cell_occurences <- merge(cell_counts %>% expand(tnumber, meta_description), cell_counts, by = c('tnumber','meta_description'), all.x = TRUE) %>%
mutate(n = coalesce(n, 0)) %>%
mutate(category = ifelse(n==0,'0', ifelse(n<10, '0<10', ifelse(n<50,'10<50','>50')))) %>% group_by(meta_description) %>%
count(category)
lvls <- cell_occurences %>% filter(category == '0') %>% arrange(desc(n)) %>% pull(meta_description)
#Haakje: Zijn de samples met hoge counts in cell types met veel 0 counts, verrijkt in bepaaplde subtypes of iets dergelijks?
ggplot(cell_occurences, aes(x=factor(meta_description, level = lvls), y=n, fill=factor(category, levels=c('>50','10<50','0<10','0')))) +
geom_bar(position="fill", stat="identity") + theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
ggtitle('Cell type counts in samples') + ylab('proportion') + xlab('')+ guides(fill=guide_legend(title="occurences"))
if (params$save_files){
ggsave(paste(here('output/Data_exploration/'),'celltypecounts.pdf', sep=''))
}
## Saving 7 x 5 in image
Images from the METABRIC cohort are small compared to the NABUCCO data and therefore have a lower cell count.
nabucco_data <- read_tsv(here('DATA/nabucco_1_densities.tsv'))
## Rows: 382 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): tnumber, phenotype, assigned_loc
## dbl (3): n, area, density
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ggplot(data= nabucco_data %>% group_by(tnumber) %>% count(n)) + geom_bar(aes(y=n, x=tnumber), stat='identity') + theme_bw() + ggtitle('Total counts per sample in NABUCCO cohort') + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
## Storing counts in `nn`, as `n` already present in input
## ℹ Use `name = "new_name"` to pick a new name.
if (params$save_files){
ggsave(paste(here('output/Data_exploration/'),'NABUCCO_totalcounts.pdf', sep=''))
}
## Saving 7 x 5 in image
ggplot(data=nabucco_data) + geom_boxplot(aes(x=phenotype, y=n)) + theme_bw() + ylim(0,1000) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle('Cell type counts per sample in NABUCCO cohort')
## Warning: Removed 195 rows containing non-finite values (`stat_boxplot()`).
if (params$save_files){
ggsave(paste(here('output/Data_exploration/'),'NABUCCO_samplecounts.pdf', sep=''))
}
## Saving 7 x 5 in image
## Warning: Removed 195 rows containing non-finite values (`stat_boxplot()`).
NABUCCO_cell_occurences <- merge(nabucco_data %>% expand(tnumber, phenotype), nabucco_data, by = c('tnumber','phenotype'), all.x = TRUE) %>%
mutate(n = coalesce(n, 0)) %>%
mutate(category = ifelse(n==0,'0', ifelse(n<10, '0<10', ifelse(n<50,'10<50','>50')))) %>% group_by(phenotype) %>%
count(category)
lvls <- NABUCCO_cell_occurences %>% filter(category == '>50') %>% arrange(n) %>% pull(phenotype)
ggplot(NABUCCO_cell_occurences, aes(x=factor(phenotype, level = lvls), y=n, fill=factor(category, levels=c('>50','10<50','0<10','0')))) +
geom_bar(position="fill", stat="identity") + theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
ggtitle('Cell type counts in samples in NABUCCO cohort') + ylab('proportion') + xlab('')+ guides(fill=guide_legend(title="occurences"))
if (params$save_files){
ggsave(paste(here('output/Data_exploration/'),'NABUCCO_celltypecounts.pdf', sep=''))
}
## Saving 7 x 5 in image
In future steps we are going to estimate distances between cell type combinations. Not every combination is present in every sample however. We have a large sample size and this possibly compensates for the low cell type count. Cell types that are too rare cannot be considered.
cell_occurences_combinations <- getCombinationCounts()
print(paste('number of samples that cannot be estimated: ', nrow(cell_occurences_combinations %>% filter(n_from == 0 | n_to ==0))))
## [1] "number of samples that cannot be estimated: 498295"
heatmap_occurences <- function(dataset, occ_number){
cell_occurences_categories <- dataset %>% mutate(category = ifelse(n_to > occ_number & n_from> occ_number, paste(as.character(occ_number),'+',sep=''), as.character(occ_number))) %>%
group_by(phenotype_from, phenotype_to) %>%
count(category) %>% mutate(n = n/792)
df <- cell_occurences_categories %>% filter(category == paste(as.character(occ_number),'+',sep=''))
trans_df <- xtabs(n ~ phenotype_from + phenotype_to, df)
hm <- Heatmap(trans_df, cluster_rows = T, cluster_columns = T, row_names_gp = gpar(fontsize=6), column_names_gp = gpar(fontsize=6),
name = paste('sample percentage \n with', paste(as.character(occ_number),'+',sep=''), 'occ.'), column_title = 'phenotype_to', row_title = 'phenotype_from')
print(hm)
if (params$save_files){
save_pdf(hm, paste(here('output/Data_exploration/'), occ_number, 'plus_occurences_heatmap.pdf', sep=''), width=6, height=4)
}
}
heatmap_occurences(cell_occurences_combinations, 0)
heatmap_occurences(cell_occurences_combinations, 5)
heatmap_occurences(cell_occurences_combinations, 10)
heatmap_occurences(cell_occurences_combinations, 20)
Look in each subtype.
cell_occurences_combinations <- getCombinationCounts()
heatmap_occurences <- function(dataset, occ_number, subtype){
cell_occurences_categories <- dataset %>% mutate(category = ifelse(n_to > occ_number & n_from> occ_number, paste(as.character(occ_number),'+',sep=''), as.character(occ_number))) %>%
group_by(phenotype_from, phenotype_to) %>%
count(category) %>%
mutate(n = n/ length(unique(dataset %>% pull(tnumber))))
df <- cell_occurences_categories %>% filter(category == paste(as.character(occ_number),'+',sep=''))
trans_df <- xtabs(n ~ phenotype_from + phenotype_to, df)
hm <- Heatmap(trans_df, cluster_rows = F, cluster_columns = F, row_names_gp = gpar(fontsize=6), column_names_gp = gpar(fontsize=6),
name = paste('sample percentage \n of type', subtype, '\n with', paste(as.character(occ_number),'+',sep=''), 'occ.'), column_title = 'phenotype_to', row_title = 'phenotype_from')
print(hm)
}
heatmap_occurences(cell_occurences_combinations, 0,'ALL')
subtypes = group_split(clinical_data %>% group_by(PAM50))
for (s in 1:length(subtypes)){
subtype = subtypes[[s]]
heatmap_occurences(cell_occurences_combinations %>% filter(tnumber %in% subtype$ImageNumber), 0,unique(subtype$PAM50))
}
If we remove combinations in samples with a threshold count, the number of samples decreases fast.
thresholds <- tibble(threshold = c(0:200))
compute_from <- function(t){
return(nrow(cell_occurences_combinations %>% filter(n_to >= t)))
}
compute_both <- function(t){
return(nrow(cell_occurences_combinations %>% filter(n_to >= t & n_from >= t)))
}
n_from <- apply(thresholds,1, compute_from)
n_both <- apply(thresholds,1, compute_both)
thresholds <- cbind(thresholds, n_from, n_both)
ggplot(data=thresholds) + geom_point(aes(x=threshold, y=n_from, color = 'phenotype_from OR phenotype_to')) +
geom_point(aes(x=threshold, y=n_both, color='phenotype_from AND phenotype_to')) + theme_bw() + ggtitle('Number of samples with at least x cells') + ylab('count') + xlab('n cells') + xlim(0,50)
## Warning: Removed 150 rows containing missing values (`geom_point()`).
## Removed 150 rows containing missing values (`geom_point()`).
if (params$save_files){
ggsave(here('output/Data_exploration/occurence_slope.pdf'))
}
## Saving 7 x 5 in image
## Warning: Removed 150 rows containing missing values (`geom_point()`).
## Removed 150 rows containing missing values (`geom_point()`).
Look at the distribution of breast cancer subtypes.
clinical_data <- read_csv(here('DATA/IMCClinical.csv'))
## Rows: 709 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): metabric_id, ERStatus, LymphNodesOrdinal, sizeOrdinal, PAM50, IntClust
## dbl (3): Grade, yearsToStatus, DeathBreast
## lgl (2): ERBB2_pos, isValidation
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
clinical_data_ext <- read_tsv(here('DATA/brca_metabric_clinical_data.tsv'))
## Rows: 2509 Columns: 39
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (27): Study ID, Patient ID, Sample ID, Type of Breast Surgery, Cancer Ty...
## dbl (12): Age at Diagnosis, Cohort, Neoplasm Histologic Grade, Lymph nodes e...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
clinical_data_Rueda <- read.delim(here('DATA/41586_2019_1007_MOESM8_ESM.txt')) #https://www-nature-com.tudelft.idm.oclc.org/articles/s41586-019-1007-8#MOESM7
contingency_table <- function(feature1, feature2){
clinical_data <- merge(clinical_data %>% select(c('metabric_id', feature1)),
clinical_data_ext %>% select(c('Sample ID', feature2)), by.x='metabric_id', by.y='Sample ID',
all.x=T)
table(clinical_data[,2], clinical_data[,3])
}
order <- tibble(intclust = c("IntClust 1", "IntClust 2", "IntClust 3", "IntClust 4-", "IntClust 4+", "IntClust 5-", "IntClust 5+", "IntClust 6", "IntClust 7", "IntClust 8", "IntClust 9", "IntClust 10"))
# Contingency tables
contingency_table('PAM50',"Pam50 + Claudin-low subtype")
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(feature1)
##
## # Now:
## data %>% select(all_of(feature1))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(feature2)
##
## # Now:
## data %>% select(all_of(feature2))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
##
## Basal claudin-low Her2 LumA LumB Normal
## Basal 43 45 0 0 0 0
## HER2 0 3 58 0 0 0
## Luminal A 0 11 0 217 0 0
## Luminal B 0 5 0 0 156 0
## Normal-like 0 27 0 0 0 46
contingency_table('PAM50',"3-Gene classifier subtype" )
##
## ER-/HER2- ER+/HER2- High Prolif ER+/HER2- Low Prolif HER2+
## Basal 67 2 1 4
## HER2 8 14 1 33
## Luminal A 1 53 156 7
## Luminal B 0 113 11 16
## Normal-like 14 8 40 4
contingency_table('IntClust',"Integrative Cluster" )
##
## 1 10 2 3 4ER- 4ER+ 5 6 7 8 9
## IntClust 1 49 0 0 0 0 0 0 0 0 0 0
## IntClust 10 0 61 0 0 0 0 0 0 0 0 0
## IntClust 2 0 0 29 0 0 0 0 0 0 0 0
## IntClust 3 0 0 0 84 0 0 0 0 0 0 0
## IntClust 4- 0 0 0 0 24 13 0 0 0 0 0
## IntClust 4+ 0 0 0 0 2 73 0 0 0 0 0
## IntClust 5- 0 0 0 0 0 0 32 0 0 0 0
## IntClust 5+ 0 0 0 0 0 0 32 0 0 0 0
## IntClust 6 0 0 0 0 0 0 0 28 0 0 0
## IntClust 7 0 0 0 0 0 0 0 0 60 0 0
## IntClust 8 0 0 0 0 0 0 0 0 0 80 0
## IntClust 9 0 0 0 0 0 0 0 0 0 0 44
contingency_table('ERStatus',"ER Status" )
##
## Negative Positive
## neg 104 18
## pos 28 452
contingency_table('ERStatus',"ER status measured by IHC" )
##
## Negative Positve
## neg 122 0
## pos 0 480
contingency_table_withindata <- function(feature1, feature2){
clinical_data <- merge(clinical_data %>% select(c('metabric_id', feature1)),
clinical_data %>% select(c('metabric_id', feature2)), by.x='metabric_id', by.y='metabric_id',
all.x=T)
table(clinical_data[,2], clinical_data[,3])
}
contingency_table_withindata('PAM50',"ERStatus" )
##
## neg pos
## Basal 64 22
## HER2 39 21
## Luminal A 5 220
## Luminal B 5 154
## Normal-like 9 63
contingency_table_Rueda <- function(feature1, feature2){
clinical_data <- merge(clinical_data %>% select(c('metabric_id', feature1)),
clinical_data_Rueda %>% select(c('METABRIC.ID', feature2)), by.x='metabric_id', by.y='METABRIC.ID',
all.x=T)
table(clinical_data[,2], clinical_data[,3])
}
contingency_table_Rueda('PAM50', "Pam50Subtype" )
##
## Basal Her2 LumA LumB Normal
## Basal 88 0 0 0 0
## HER2 0 61 0 0 0
## Luminal A 0 0 228 0 0
## Luminal B 0 0 0 161 0
## Normal-like 0 0 0 0 73
contingency_table_Rueda('IntClust',"iC10" )
##
## 1 10 2 3 4ER- 4ER+ 5 6 7 8 9
## IntClust 1 49 0 0 0 0 0 0 0 0 0 0
## IntClust 10 0 61 0 0 0 0 0 0 0 0 0
## IntClust 2 0 0 29 0 0 0 0 0 0 0 0
## IntClust 3 0 0 0 84 0 0 0 0 0 0 0
## IntClust 4- 0 0 0 0 24 13 0 0 0 0 0
## IntClust 4+ 0 0 0 0 2 73 0 0 0 0 0
## IntClust 5- 0 0 0 0 0 0 32 0 0 0 0
## IntClust 5+ 0 0 0 0 0 0 32 0 0 0 0
## IntClust 6 0 0 0 0 0 0 0 28 0 0 0
## IntClust 7 0 0 0 0 0 0 0 0 60 0 0
## IntClust 8 0 0 0 0 0 0 0 0 0 80 0
## IntClust 9 0 0 0 0 0 0 0 0 0 0 44
contingency_table_Rueda('ERStatus',"ER.Expr")
##
## - +
## neg 104 18
## pos 28 452
contingency_table_Rueda('ERStatus',"ER.Status")
##
## neg pos
## neg 122 0
## pos 0 480
# Intclust composition
order <- merge(order, as.data.frame.matrix(contingency_table('IntClust',"PR Status" )),by.x='intclust',by.y='row.names') %>% rename('PR_positive' = 'Positive', 'PR_negative' = 'Negative')
order <- merge(order, as.data.frame.matrix(contingency_table('IntClust',"ER Status" )),by.x='intclust',by.y='row.names') %>% rename('ER_positive' = 'Positive', 'ER_negative' = 'Negative')
order <- merge(order, as.data.frame.matrix(contingency_table('IntClust',"HER2 Status" )),by.x='intclust',by.y='row.names') %>% rename('HER2_positive' = 'Positive', 'HER2_negative' = 'Negative')
melt_order <- reshape2::melt(order, id=c('intclust'))
ggplot(data=melt_order %>% filter(variable=='HER2_positive' | variable=='HER2_negative')) + geom_col(aes(x=intclust, y=value,fill=variable),stat='identity',position='fill') + theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle('HER2 counts in Intclusts')
## Warning in geom_col(aes(x = intclust, y = value, fill = variable), stat =
## "identity", : Ignoring unknown parameters: `stat`
ggplot(data=melt_order %>% filter(variable=='ER_positive' | variable=='ER_negative')) + geom_col(aes(x=intclust, y=value,fill=variable),stat='identity',position='fill') + theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle('ER counts in Intclusts')
## Warning in geom_col(aes(x = intclust, y = value, fill = variable), stat =
## "identity", : Ignoring unknown parameters: `stat`
ggplot(data=melt_order %>% filter(variable=='PR_positive' | variable=='PR_negative')) + geom_col(aes(x=intclust, y=value,fill=variable),stat='identity',position='fill') + theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle('PR counts in Intclusts')
## Warning in geom_col(aes(x = intclust, y = value, fill = variable), stat =
## "identity", : Ignoring unknown parameters: `stat`
order <- tibble(intclust = c("IntClust 1", "IntClust 2", "IntClust 3", "IntClust 4-", "IntClust 4+", "IntClust 5-", "IntClust 5+", "IntClust 6", "IntClust 7", "IntClust 8", "IntClust 9", "IntClust 10"))
order <- merge(order, as.data.frame.matrix(contingency_table('IntClust',"Pam50 + Claudin-low subtype" )),by.x='intclust',by.y='row.names')
melt_order <- reshape2::melt(order, id=c('intclust'))
ggplot(data=melt_order) + geom_col(aes(x=intclust, y=value,fill=variable),stat='identity',position='fill') + theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + ggtitle('PAM50 + claudin-low counts in Intclusts')
## Warning in geom_col(aes(x = intclust, y = value, fill = variable), stat =
## "identity", : Ignoring unknown parameters: `stat`
# subtypes in data
ggplot(data = clinical_data) + geom_bar(aes(x=PAM50)) + theme_bw() + ggtitle('breast cancer subtype occurences')
if (params$save_files){
ggsave(here('output/Data_exploration/subtypes.pdf'))
}
## Saving 7 x 5 in image
Look at sampled images.
TME = c("B cells","CD38^{+} lymphocytes","CD4^{+} T cells","CD4^{+} T cells & APCs","CD57^{+}","CD8^{+} T cells","Endothelial","Fibroblasts","Fibroblasts FSP1^{+}","Granulocytes", "Ki67^{+}","Macrophages","Macrophages & granulocytes", "Myofibroblasts","Myofibroblasts PDPN^{+}", "T_{Reg} & T_{Ex}")
tumor = setdiff(unique(cells %>% pull(meta_description)), TME)
samples <- sample(unique(cells$ImageNumber),5)
for (s in samples){
print(show_slide(s, tumor))
}
## Warning in as_grob.default(plot): Cannot convert object of class numeric into a
## grob.
## Warning in as_grob.default(plot): Cannot convert object of class numeric into a
## grob.
## Warning in as_grob.default(plot): Cannot convert object of class numeric into a
## grob.
## Warning in as_grob.default(plot): Cannot convert object of class numeric into a
## grob.
## Warning in as_grob.default(plot): Cannot convert object of class numeric into a
## grob.
i = 1
print(show_slide(i,tumor))
## Warning in as_grob.default(plot): Cannot convert object of class numeric into a
## grob.
ggsave(paste('output/Slides/exploreslide_',i,'.pdf',sep=''), width = 15, height=10)
Cell types are classified based on expression profiles.
library(ComplexHeatmap)
expression_profiles_summary <- as.data.frame(cells %>% group_by(meta_description) %>%
summarise_at(vars("Histone H3":"DNA2"), mean))
rownames(expression_profiles_summary) <- expression_profiles_summary$meta_description
expression_profiles_summary <- scale(subset(expression_profiles_summary, select=-c(meta_description)))
hm <- Heatmap(expression_profiles_summary, name = 'mean expression', column_title = 'protein', row_title = 'cell type',
row_names_gp = gpar(fontsize=6), column_names_gp = gpar(fontsize=6), cluster_rows = T, cluster_columns = T)
print(hm)
if (params$save_files){
save_pdf(hm, here('output/Data_exploration/expression_profiles.pdf'), width=6, height=4)
}